library(usethis)
library(dplyr)
library(tidytuesdayR)
library(tidyverse)
#library(cancerprof) - need to put in directly from github but not reloaded automatically so may need new codeblock for reproducibility
library(tidycensus)
library(broom)
library(plotly)
library(gt)
library(gtsummary)
library(ggplot2)
library(janitor)
library(sf)
library(tigris)
library(naniar)
#library(usmap)  # for simple state mapping (need above)
library(ggthemes)  # optional, for prettier maps (need above)

options(tigris_use_cache = TRUE)

1 Data

1.1 Data Load

## ---- Compiling #TidyTuesday Information for 2025-04-08 ----
## --- There is 1 file available ---
## 
## 
## ── Downloading files ───────────────────────────────────────────────────────────
## 
##   1 of 1: "care_state.csv"

1.2 Data Exploration

#we need to group the averages versus the percentages
#There's Sepsis and Opioid prescription measures which are percentages which are not labeled as percentages
# conditions are the themes for the types of measures
# Classify each measure as an average ('avg') or percentage ('pct') from its
# name; sepsis-bundle and opioid-safety measures are percentages even though
# their names don't say "percentage", so they are matched explicitly.
hosp_dat3 <- hosp_dat2 %>% 
            mutate(typenum = case_when(
              str_detect(measure_name, '[Aa]verage') ~ 'avg', 
              str_detect(measure_name, '[Pp]ercentage|Safe\\sUse|Septic\\sShock|Severe\\sSepsis') ~ 'pct', 
              # NA_character_ keeps all branches the same type: a bare logical
              # NA errors under stricter/older dplyr case_when type checking
              TRUE ~ NA_character_
            ))

#Explore if anything has time for different measures
# Convert each measure's duration to days and check, per measure, whether the
# duration ever varies. An empty result means every measure has one duration.
# NOTE(review): assumes measure_len is a lubridate interval/duration so
# time_length() applies — confirm upstream construction.
hosp_dat3 %>% 
    mutate(time_var = time_length(measure_len, unit='days')) %>% 
    group_by(measure_name) %>% 
   summarise(min_time = min(time_var, na.rm = TRUE),
             max_time = max(time_var, na.rm = TRUE)) %>% 
    filter(max_time!=min_time)
#alternatively:
# Same check via reframe(): two rows (min, max) per measure; print all 42 rows
hosp_dat3 %>% 
    mutate(time_var = time_length(measure_len, unit='days')) %>% 
    group_by(measure_name) %>% 
   reframe(ran_var = range(time_var, na.rm=TRUE)) %>% 
    print(n=42)
#So regardless of the measure the duration of the measure is the same

#There is some hard to interpret measures such as avg time spent in ED before being sent home with different 'levels' in parentheses but there's not really an explanation for this
# Median score per Emergency Department 'average' measure, to compare the
# parenthetical levels against each other
hosp_dat3 %>% 
    filter(condition=='Emergency Department' & typenum=='avg') %>% 
    group_by(measure_name) %>% 
    summarise(median_scr = median(score, na.rm = TRUE))
# Score ranges for the minutes-based OP_* measure IDs
# NOTE(review): in the regex, 'MIN?' makes the trailing N optional, so this
# also matches IDs containing just 'MI' — presumably 'MIN' was intended;
# confirm against the measure_id values.
hosp_dat3 %>% 
    filter(str_detect(measure_id, '^OP_?.*MIN?.*')) %>% 
    group_by(measure_id) %>% 
    reframe(min_score = min(score, na.rm=TRUE),
            max_score = max(score, na.rm = TRUE))

#it might be good just to use the 'raw' score of OP_18b and OP_18c because these categories are hard to discern
#The categories of LOW, MEDIUM, HIGH, and VERY HIGH are a bit hard to understand in these data. Those with very low scores don't have a 'very high' but those with the maximum score for 18b of 310 don't have a
# (NOTE: the sentence above appears truncated in the original notes)

#18b and 18c seem to contain others
#Sep_1 may contain Sep_SH_3HR; sep_SH_6hr; SEV_sep_3hr; SEV_SEP_6HR
# Spread the sepsis measures wide (one column per measure_id, one row per
# state) to eyeball whether SEP_1 aggregates the bundle sub-measures
hosp_dat3 %>% 
    filter(str_detect(measure_id, '.*SEP_?.*')) %>% 
    select(state, condition, measure_id, score) %>% 
    pivot_wider(id_cols = state, names_from = measure_id, values_from=score)

1.3 Preliminary Plots

1.4 Import outside data

#2020 state population data
# 'P1_001N' is presumably total population from the 2020 decennial P1 table —
# TODO confirm with tidycensus::load_variables(2020, "pl")
cen_totpop <- tidycensus::get_decennial('state', year=2020, variables = 'P1_001N')


# income and poverty levels
# Named ACS variable codes; the names become column prefixes in the wide output
# (per the code: median/per-capita income, GINI index, poverty count and total)
vars <- c(
  median_inc = "B19013_001",
  per_capita_inc = "B19301_001",
  gini_ind = "B19083_001",
  poverty_ct = "B17001_002",
  poverty_tot = "B17001_001"
)

#Note that the variables ending in 'E' are Estimates and the variables ending in 'M' are 'Margins of Error' when selecting 'wide'
# 5-year ACS ending in 2022, one row per state, one E/M column pair per variable
acs_incdat <- get_acs(
  geography = "state",
  variables = vars,
  survey = "acs5",
  year = 2022,
  output = "wide"
)

#land area of the states

# 2024 Census Gazetteer national file (tab-separated); used later for its
# USPS abbreviation and ALAND_SQMI (land area) columns
state_area_data <- readr::read_tsv('2024_Gaz_state_national.txt')

#Overdose data in the states?
# State drug-overdose mortality file — presumably one row per state-year
# (it is filtered to YEAR == 2022 below); verify source and schema
ovrdose_st <- read_csv('drug_mortality_bystate_2022.csv')

# Keep only the 2022 overdose rows, recode the District of Columbia to its
# USPS abbreviation, drop the URL column, and rename the remaining columns to
# match the hospital dataset's naming.
ovrdose_st2 <- ovrdose_st %>%
    filter(YEAR == 2022) %>%
    mutate(STATE = if_else(STATE == 'District of Columbia', 'DC', STATE)) %>%
    select(-URL) %>%
    rename(Rate_per_100k = RATE,
           Year_dat = YEAR,
           Overdose_deaths = DEATHS,
           state = STATE)

1.5 Potential additional descriptive analyses

1.5.1 Exploring data for 3 vs. 6 hour bundles

#SEP_SH_3HR vs. SEP_SH_6HR; SEV_SEP_3HR vs. SEV_SEP_6HR
#3hr vs. 6 hr bundles for sepsis 
hosp_dat3 %>% 
    filter(str_detect(measure_id, '.*SEP_?.*')) %>% 
    select(state, condition, measure_id, score) %>% 
    pivot_wider(id_cols = state, names_from = measure_id, values_from=score) %>% 
    mutate(thrvssix_shock = (SEP_SH_6HR-SEP_SH_3HR)/SEP_SH_3HR,
           thrvssix_svr = (SEV_SEP_6HR - SEV_SEP_3HR)/SEV_SEP_3HR
    ) 

1.5.2 Overdoses vs. correct opioid prescribing. Here, the overdose data covers the year to year and a half prior to the prescribing-standards measure in the dataset

## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_line()`).

1.5.3 Not many descriptive patterns except very high overdose rates in DC and WV but not in others with similar scores. Lower than Median scores in 2023-2024 but more than median drug overdoses in 2022

2 PCA with ranked variables

First, states or variables with uninformative, insufficient, or missing data are dropped from these analyses. Many of the territories had no data. Additionally, OP_31 was only present for 12 states.

The data consists of a mix of higher-is-better and lower-is-better measures, as well as absolute and percentage scores. Here, the 'lower is better' scores — OP_22, SAFE_USE_OF_OPIOIDS, OP_18b, and OP_18c — are flipped.

2.1 The first analysis is a PCA of the covariance matrix of ranked scores.

# Step 3: Rank all numeric variables (but keep the ID variable as it is)
# Negating before rank() gives rank 1 to the highest score; ties.method = "min"
# assigns tied values the smallest possible rank.
hosp_ranked <- hospdat_flipped %>%
  mutate(across(where(is.numeric), ~rank(-., ties.method = "min"))) #giving lower ranks to higher scores and handling ties as the minimum possible rank and same values

# Step 4: Extract numeric data (for PCA) and drop the ID variable (State)
ranked_numeric <- hosp_ranked %>%
  select(-all_of(id_var)) #syntax is a bit weird for one variable but needed since I'm calling an external vector

# Step 5: Compute Spearman correlation matrix
# (Spearman on already-ranked columns is equivalent to Pearson on the ranks)
cor_matrix <- cor(ranked_numeric, method = "spearman")

# Step 6: Perform PCA
# NOTE(review): prcomp() is applied to the correlation MATRIX itself rather
# than to the ranked observations, so the "rows" being projected are the
# variables — confirm this is intended vs. prcomp(ranked_numeric, ...).
pca_result <- prcomp(cor_matrix, center = TRUE, scale. = TRUE)

2.1.1 Visualizing the variance with a screeplot:

2.1.2 Additionally looking at the Principal Components (PCs) explaining the variance

# Step 7a: View eigenvalues and % variance explained
# Eigenvalues are the squared singular values from prcomp; each eigenvalue's
# share of the total is the proportion of variance its PC explains.
eigenvalues <- pca_result$sdev^2
prop_variance <- eigenvalues / sum(eigenvalues)
cum_variance <- cumsum(prop_variance)

# One row per PC with its eigenvalue, proportion, and cumulative proportion.
variance_explained <- data.frame(
  PC = paste0("PC", seq_along(eigenvalues)),  # seq_along avoids the 1:length() zero-length trap
  Eigenvalue = eigenvalues,
  Proportion = round(prop_variance, 3),
  Cumulative = round(cum_variance, 3)
)

print(variance_explained)
##    PC   Eigenvalue Proportion Cumulative
## 1 PC1 3.638907e+00      0.455      0.455
## 2 PC2 1.811141e+00      0.226      0.681
## 3 PC3 1.112091e+00      0.139      0.820
## 4 PC4 6.627018e-01      0.083      0.903
## 5 PC5 5.165507e-01      0.065      0.968
## 6 PC6 2.552873e-01      0.032      1.000
## 7 PC7 3.321550e-03      0.000      1.000
## 8 PC8 1.262481e-33      0.000      1.000

2.1.3 Choosing the first 3 PCs as they explain 80+% of the variance for the PCA of the covariance matrix, and summing the individual variables' contributions to the loadings for these PCs

#Visualization for these total contributions of variables

# Turn the named contribution vector into a two-column tibble
# (variable name, contribution) for plotting.
total_cont <- enframe(ranked_contrib, name = "variable", value = "contribution")

# Horizontal bar chart, variables ordered by their total contribution
ggplot(total_cont, aes(x = reorder(variable, contribution), y = contribution)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(title = "Variable Contributions to PCA (PC1, PC2, PC3)", 
       x = "Variable", y = "Contribution") +
  theme_minimal()

# Optional: Plot a biplot of the PCA result
biplot(pca_result, scale = 0, main = "PCA on Ranked and Transformed Variables")

For this analysis both Safe prescribing of Opioids and general care are most important. OP_29 represents the percentage of patients receiving an appropriate recommendation for follow-up screening of a colonoscopy. OP_23, the percentage of patients with stroke symptoms presenting at the ED who received brain scan results within 45 minutes, and SEP_1, the percentage of patients receiving appropriate care for sepsis and septic shock, next contribute to these PCs.

The biplot of PC1 and PC2 shows similar contributions to PC1 for OP_18b and OP_18c — time in the emergency department for patients, and time in the emergency department for psychiatric patients. This is opposed, or negatively correlated, to OP_29. This may hint towards differences between Emergency Department efficiency and general practice care for screenings. What is also interesting is the PC2 result, with Sepsis care being negatively correlated with Safe Opioid prescribing. This may represent differences in pharmacovigilance and pain management vs. quality of care for infectious and life-threatening disease, along PC2.

2.2 The second PCA examines simply the ranks of the scores by state in order to look at state groupings in the data.

2.2.1 Examining Eigenvalues and cumulative variance for selecting PCs

Again, Eigenvalues are examined as well as a scree plot. These results have more PCs; here 4 are chosen for ~77% of the variance of the data.

##    PC Eigenvalue Proportion Cumulative
## 1 PC1 2.54053571      0.318      0.318
## 2 PC2 1.50511484      0.188      0.506
## 3 PC3 1.13861459      0.142      0.648
## 4 PC4 0.97138875      0.121      0.769
## 5 PC5 0.70882473      0.089      0.858
## 6 PC6 0.58330207      0.073      0.931
## 7 PC7 0.48519686      0.061      0.992
## 8 PC8 0.06702245      0.008      1.000

2.2.2 Examining the loadings and contributions for the variables out of curiosity although this analysis is more aimed at grouping States for patterns based on geography.

# Step 8: Rank variable contributions to the first 4 PCs for the state data (optional)
# Squared loadings measure each variable's contribution to a PC; summing the
# first four gives a total contribution per variable.
loadings2 <- pca_scaled_states$rotation
contrib_pc1b <- loadings2[, 1]^2
contrib_pc2b <- loadings2[, 2]^2
contrib_pc3b <- loadings2[, 3]^2
contrib_pc4b <- loadings2[, 4]^2
total_contrib2 <- rowSums(cbind(contrib_pc1b, contrib_pc2b, contrib_pc3b, contrib_pc4b))


# Rank by total contribution
ranked_contrib2 <- sort(total_contrib2, decreasing = TRUE)


#not sure this is necessary because does total contributions for variables with state groupings analysis
# Two-column tibble (variable, contribution) built from the named vector
total_cont2 <- enframe(ranked_contrib2, name = "variable", value = "contribution")

print(total_contrib2)
##               IMM_3              OP_18b              OP_18c               OP_22 
##           0.4544104           0.4051159           0.3902834           0.3129296 
##               OP_23               OP_29 SAFE_USE_OF_OPIOIDS               SEP_1 
##           0.8942038           0.6521496           0.5384184           0.3524891

2.2.3 Preparing the data for the groupings of the states and performing Kmeans clustering

# Extracting the state scores (projections) onto the PCs

# NOTE(review): assumes the rows of pca_scaled_states$x are in the same order
# as hosp_ranked — confirm where pca_scaled_states is computed
state_names <- hosp_ranked[[id_var]]  # e.g., state abbreviations

# Get the PCA scores (these are your transformed coordinates in PC space)
state_scores <- as.data.frame(pca_scaled_states$x[, 1:4])  # taking the first 4 PCs as Eigen value ~1 or more ; explains 77% of the variance
state_scores$state <- state_names


# K-means on the 4 PC coordinates; 25 random starts for stability and a fixed
# seed so the cluster assignment is reproducible
set.seed(555)  # set seed to reproduce results
kmeans_result <- kmeans(state_scores[, 1:4], centers = 3, nstart = 25)
state_scores$cluster <- kmeans_result$cluster

# Aggregating the 4 PCs contributions to the state clusters:
# (mean PC1..PC4 coordinate per cluster, via base aggregate())
cluster_agg <- aggregate(. ~ cluster, data = state_scores[, c("PC1", "PC2", "PC3", "PC4", "cluster")], mean)

2.2.4 3D Plotly to examine the first 3 PCs with the clusters

2.2.5 Examining the 4 PCs for the state data

2.2.6 Contributions of the 4 PCs for the clusters

# Mean PC1–PC4 coordinate per cluster, with mean_-prefixed column names
cluster_summary <- state_scores %>%
     group_by(cluster) %>%
     summarise(across(PC1:PC4, \(pc) mean(pc), .names = "mean_{.col}"))

2.3 Mapping

2.3.1 Data preparation using custom functions for spatial data processing

# Join abbreviations and names for help joining with map datasets

# Lookup table of lowercase state names ('region') to USPS abbreviations
# ('stabb'): the 50 states plus DC sorted alphabetically, with Puerto Rico
# appended last (after the sort), then names lowercased for map joins.
statekey <- tibble(region = state.name, stabb = state.abb) %>% 
            add_row(region = 'District of Columbia', stabb = 'DC') %>% 
            arrange(region) %>% 
            add_row(region = 'Puerto Rico', stabb = 'PR') %>% 
            mutate(region = tolower(region)) 
            
            

#Edited code from ChatGPT with some modification for joins

# Get state geometries (cartographic boundary files, 1:20m resolution)
states_formap <- tigris::states(cb = TRUE, resolution = "20m", class = "sf") %>% 
                 st_transform(2163)  # EPSG:2163 = US National Atlas Equal Area (Lambert azimuthal; not Albers)

# Check CRS of the full data
#st_crs(states_formap)  # should be EPSG:4269 (NAD83)

# Alaska, explicitly resetting CRS after filtering
# NOTE(review): st_scale4_crs / st_shift4_crs / rotate_geometry2 are custom
# helpers defined elsewhere in this project — presumably they scale, translate,
# and rotate geometries while preserving the CRS; confirm their definitions
alaska <- states_formap %>%
    filter(STUSPS == "AK") %>%
   # st_set_crs(4269) %>%  # Ensure original CRS is set (NAD83)
    st_transform(2163) %>%  # Transform to equal-area projection (EPSG:2163)
    st_scale4_crs(0.35) %>%  # Scale with CRS preservation
    st_shift4_crs(c(700000, - 4900000)) %>% #manually puts just south of CA
    rotate_geometry2(., -40) #manually rotates the state for the inset


# Hawaii: shift and rotate into inset position (same helper pipeline)
hawaii <- states_formap %>% filter(STUSPS == "HI") %>%
  st_transform(2163) %>%
  st_shift4_crs(c(5000000, -1450000)) %>% #manual shift for inset
  rotate_geometry2(., -40)


# Puerto Rico: shift and rotate into inset position
puerto_rico <- states_formap %>% filter(STUSPS == "PR") %>%
  st_transform(2163) %>%
  st_shift4_crs(c(-1200000, -30000)) %>%  # manual shift for inset
  rotate_geometry2(., 15)

# # Combine Alaska, Hawaii, and Puerto Rico with the rest of the states
remaining_states <- states_formap %>%
  filter(!(STUSPS %in% c("AK", "HI", "PR")))

# Combine the transformed data (one sf data frame for mapping)
states_combined <- bind_rows(remaining_states, alaska, hawaii, puerto_rico)

3 Map of Kmeans Clusters for PCAs in the US and Puerto Rico

# Plot
# Join cluster assignments onto the inset-adjusted state geometries and map
# them. The original code called scale_fill_manual() and labs() twice each;
# ggplot2 keeps only the LAST fill scale (with a "scale already present"
# warning) and the last labs() values, so the duplicates are merged into a
# single call each — the rendered plot is unchanged.
map1 <- left_join(states_combined, state_scores, by = c("STUSPS" = "state")) %>% 
    ggplot(.) +
    geom_sf(aes(fill = as.factor(cluster)), color = "white", size = 0.2) +
    scale_fill_manual(values = c("1" = "#1b9e77", "2" = "#d95f02", "3" = "#7570b3"),
                      labels = c("1: High Acute Care Performance,\n   Lower opioid/clinical safety", 
                                 "2: Better safety scores & avg ED scores,\n   Poorer gen practice", 
                                 "3: Prevention-oriented,\n    Most ED scores poor")) + 
    theme_void() +
    labs(fill = "Cluster Type", 
         title = "State Healthcare System Clusters*",
         subtitle = "Based on PCA + Cluster Analysis of CMS Measures",
         caption = '*Includes Puerto Rico')
    # Legend-placement experiments kept for reference:
    # theme(legend.position = c(1.1, 0.5),  # shift legend left of the map edge
    #       plot.margin = margin(t = 2, r = 40, b = 2, l = -120))

map1

4 Secondary analysis of the scores with income and population density considered

4.1 Data prep

#poverty stats are by state name, so need to reconvert statekey to camel case and join
#state area data already has USPS abbreviations
#cen_totpop has names
#datasets are: acs_incdat, state_area_data, cen_totpop

# Assemble state-level covariates keyed by USPS abbreviation:
# population density (2020 population / land area), median household income,
# and a flipped GINI index (lower GINI = more equality, so flip it to make
# higher = better, matching the other measures).
extra_vars <- cen_totpop %>% 
              rename(pop2020 = value) %>% 
              select(-variable) %>% 
              right_join(acs_incdat, by = c('GEOID', 'NAME')) %>% 
              left_join(state_area_data, by = c('GEOID', 'NAME')) %>% 
              rename(state = USPS) %>% 
              relocate(state) %>% 
              transmute(state,
                        median_incE,
                        popdensity = pop2020 / ALAND_SQMI,
                        gini_Eflip = 1 - gini_indE)


# Step 3: Rank all numeric variables (but keep the ID variable as it is)
# Join the extra covariates, then convert every numeric column to ranks.
# Negating before rank() assigns rank 1 to the highest score; tied values
# share the minimum possible rank.
hosp_ranked2 <- hospdat_flipped %>%
                left_join(extra_vars, by = 'state') %>%
                mutate(across(where(is.numeric), \(v) rank(-v, ties.method = "min")))

# Step 4: Extract numeric data (for PCA) and drop the ID variable (State)
ranked_numeric2 <- hosp_ranked2 %>%
  select(!all_of(id_var))

4.2 PCA for the states including the income inequality, median_income, and population density ranks

##      PC Eigenvalue Proportion Cumulative
## 1   PC1 3.44983431      0.314      0.314
## 2   PC2 1.80986477      0.165      0.478
## 3   PC3 1.45988244      0.133      0.611
## 4   PC4 1.07784251      0.098      0.709
## 5   PC5 0.91953672      0.084      0.792
## 6   PC6 0.64451396      0.059      0.851
## 7   PC7 0.53447404      0.049      0.900
## 8   PC8 0.49283058      0.045      0.944
## 9   PC9 0.37090233      0.034      0.978
## 10 PC10 0.17829570      0.016      0.994
## 11 PC11 0.06202264      0.006      1.000

4.2.1 Examining the loadings and contributions for the variables out of curiosity although this analysis is more aimed at grouping States for patterns based on geography.

# Step 8: Rank variable contributions to the first 5 PCs for the state data (optional) - later I work on 4 PCs given the EigenValues
# Squared loadings measure each variable's contribution to a PC; summing the
# first five gives a total contribution per variable.
loadings2b <- pca_scaled_states2$rotation
contrib_pc1c <- loadings2b[, 1]^2
contrib_pc2c <- loadings2b[, 2]^2
contrib_pc3c <- loadings2b[, 3]^2
contrib_pc4c <- loadings2b[, 4]^2
contrib_pc5c <- loadings2b[, 5]^2

total_contrib2b <- rowSums(cbind(contrib_pc1c, contrib_pc2c, contrib_pc3c, contrib_pc4c, contrib_pc5c))


# Rank by total contribution
ranked_contrib2b <- sort(total_contrib2b, decreasing = TRUE)


#not sure this is necessary because does total contributions for variables with state groupings analysis, this includes PC5
# Two-column tibble (variable, contribution) built from the named vector
total_cont2b <- enframe(ranked_contrib2b, name = "variable", value = "contribution")

print(total_cont2b)
## # A tibble: 11 × 2
##    variable            contribution
##    <chr>                      <dbl>
##  1 OP_23                      0.678
##  2 median_incE                0.625
##  3 SAFE_USE_OF_OPIOIDS        0.623
##  4 OP_29                      0.547
##  5 SEP_1                      0.503
##  6 IMM_3                      0.446
##  7 gini_Eflip                 0.363
##  8 OP_22                      0.343
##  9 popdensity                 0.336
## 10 OP_18b                     0.276
## 11 OP_18c                     0.260
Loadings for the First 4 PCs

| Healthcare Variable  | PC1 Loading | PC2 Loading | PC3 Loading | PC4 Loading |
|----------------------|-------------|-------------|-------------|-------------|
| IMM_3                | −0.173      | 0.501       | 0.150       | −0.010      |
| OP_18b               | 0.501       | 0.013       | −0.008      | 0.142       |
| OP_18c               | 0.486       | 0.032       | 0.031       | 0.137       |
| OP_22                | 0.306       | −0.116      | 0.385       | −0.143      |
| OP_23                | −0.097      | −0.073      | 0.510       | 0.634       |
| OP_29                | 0.013       | 0.426       | −0.123      | 0.575       |
| SAFE_USE_OF_OPIOIDS  | 0.056       | −0.335      | −0.447      | 0.286       |
| SEP_1                | 0.218       | 0.102       | 0.515       | −0.215      |
| median_incE          | −0.175      | 0.448       | −0.078      | −0.243      |
| popdensity           | −0.460      | −0.050      | 0.196       | 0.139       |
| gini_Eflip           | 0.295       | 0.472       | −0.209      | 0.036       |

Cluster Summary for Principal Components

| State Cluster | PC1 Mean | PC2 Mean | PC3 Mean | PC4 Mean |
|---------------|----------|----------|----------|----------|
| 1             | 0.180    | 1.172    | 0.019    | 0.140    |
| 2             | 2.196    | −1.108   | −0.173   | 0.171    |
| 3             | −2.043   | −0.784   | 0.114    | −0.340   |

4.2.2 3D Plotly to examine the first 3 PCs with the clusters

4.3 Mapping the State Clusters with median income, GINI index and population density accounted for